The goal of this notebook is to visualize the phylogenetic
relationship of the haplotypes identified in the previous notebooks.
I’ll read in the tree’s from SPRUCE as implemented by
MACHINA and make a nicely formatted tree where branch
length corresponds to mutations. The output of this notebook is
a final filtered set of cluster relationships that can be made into a
phylogenetic tree.
## ==== File paths input data ==== ##
if (exists("snakemake")) {
# Data from lofreq and varscan labeled with subclonal haplotypes
updated.data = snakemake@params[['incsv']]
# Spruce
spruce.data = snakemake@input[[1]]
# Counts of all reads bridging pairs of SNP positions
bridging.data = snakemake@input[[2]]
} else {
# Data from lofreq and varscan labeled with subclonal haplotypes
updated.data = "../../results/variants/validated_variants.csv"
# Spruce
spruce.data = "../../results/variants/spruce_output.tsv"
# Counts of all reads bridging pairs of SNP positions
bridging.data = "../../results/bridging/bridging_reads.csv"
}
## ==== File paths output data ==== ##
if (exists("snakemake")) {
# Path to save figures
figure.dir = paste0(snakemake@params[['figures']], "/")
# Path to save the polished assigned variants
output.path = snakemake@params[['final_tree']]
} else {
# Path to save figures
figure.dir = "../../results/figures/"
# Path to save the polished assigned variants
output.path = "../../results/phylogeny/filtered_spruce_tree.csv"
}
# Make the figure directory if it doesn't exist
if (!file.exists(figure.dir)) {
dir.create(figure.dir, recursive = TRUE)
print("Directory created.")
} else {
print("Directory already exists.")
}
## [1] "Directory already exists."
A set of phylogenetic relationships were fit to the data using the
algorithm SPRUCE. We’ll visualize these possible trees
below. Ultimately, the goal is filter these based on bridging reads.
Let’s visualize all of these trees below.
There are total of 36 trees that fit to the data. Below, I’ll use bridging reads to test the edges for these trees and filter them down to a final plausible set of trees.
First, we already have some knowledge about whether a cluster is
descended from either Genome 1 or Genome 2.
Let’s filter out trees based on this information.
cluster.backgrounds = read_csv(updated.data, show_col_types = F) %>%
# For consistency, change the names of G1, G2, and G-1-1
mutate(Haplotype = case_when(
Haplotype == "genome-1" ~ "genome 01",
Haplotype == "genome-1-1" ~ "genome 1",
Haplotype == "genome-2" ~ "genome 2",
T ~ Haplotype
)) %>%
select(Haplotype, Background) %>%
distinct() %>%
filter(!Haplotype %in% c("both", "subclonal")) %>%
add_row(Haplotype = "Anc", Background = "Anc")
trees = unique(tree.df$tree)
trees.with.genotype.violations = c()
for (tree.name in trees) {
# Get the edges from this tree
tree.edges = tree.df %>%
filter(tree == tree.name) %>%
select(from, to)
# Check if there is a violation in any of the edges
for (i in 1:nrow(tree.edges)) {
# Get the edge
edge = tree.edges[i,]
# Get the cluster names
from = str_replace_all(edge$from, "_", " ")
to = str_replace_all(edge$to, "_", " ")
# Get backgrounds
from.background = cluster.backgrounds %>%
filter(Haplotype == from) %>%
pull(Background)
to.background = cluster.backgrounds %>%
filter(Haplotype == to) %>%
pull(Background)
if (from == "genome 01" & to.background != "genome-1") {
print(paste("Violation in", tree.name, " -- from:", from, "to:", to))
trees.with.genotype.violations = c(trees.with.genotype.violations, tree.name)
} else if (from == "genome 2" & to.background != "genome-2") {
print(paste("Violation in", tree.name, " -- from:", from, "to:", to))
trees.with.genotype.violations = c(trees.with.genotype.violations, tree.name)
} else if (from.background == "genome-1" & to.background == "genome-2") {
print(paste("Violation in", tree.name, " -- from:", from, "to:", to))
trees.with.genotype.violations = c(trees.with.genotype.violations, tree.name)
} else if (from.background == "genome-2" & to.background == "genome-1") {
print(paste("Violation in", tree.name, " -- from:", from, "to:", to))
trees.with.genotype.violations = c(trees.with.genotype.violations, tree.name)
}
}
}
## [1] "Violation in tree 1 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 2 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 2 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 4 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 5 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 6 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 6 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 7 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 7 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 8 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 9 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 9 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 10 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 10 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 10 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 11 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 11 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 12 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 12 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 12 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 13 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 14 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 14 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 15 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 15 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 16 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 18 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 19 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 19 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 20 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 20 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 20 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 21 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 21 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 21 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 22 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 22 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 23 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 23 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 24 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 25 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 25 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 26 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 26 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 26 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 27 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 27 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 27 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 28 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 28 -- from: cluster 2 to: cluster 12"
## [1] "Violation in tree 28 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 28 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 29 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 29 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 29 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 30 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 30 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 30 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 30 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 31 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 31 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 32 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 32 -- from: genome 1 to: cluster 12"
## [1] "Violation in tree 32 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 33 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 33 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 34 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 35 -- from: genome 2 to: cluster 8"
## [1] "Violation in tree 35 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 35 -- from: cluster 9 to: cluster 6"
## [1] "Violation in tree 36 -- from: genome 1 to: cluster 13"
## [1] "Violation in tree 36 -- from: cluster 9 to: cluster 6"
trees.with.genotype.violations = unique(trees.with.genotype.violations)
There are 34 of 36 total trees that violate the bridging read support
for a given background (Genome 1 or
Genome 2).
The remaining 2 trees are still plausible. Can we determine which of these are more likely?
To determine if the remaining tress fit our data, I’ll see if there are cluster relationships (i.e. the tree predicts that one cluster is descended from another) that we can either confirm or disprove using our bridging reads over pairs of SNPs.
To do this, I’ll use the same approach that was used to assign
clusters to either Genome 1 or Genome 2.
However, in this version, I’ll be testing whether a cluster is descended
from another cluster, or if they arose independently.
For this approach, we need the individual mutations and their frequency, the mean frequency of every cluster in each tissue sample, and the bridging reads.
## `summarise()` has grouped output by 'Tissue', 'Haplotype'. You can override
## using the `.groups` argument.
## Joining with `by = join_by(snp_1)`
## Warning in left_join(., dplyr::rename(haplotype.labels, snp_1 = POS, hap_1 = Haplotype)): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 11224 of `x` matches multiple rows in `y`.
## ℹ Row 268 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Joining with `by = join_by(snp_2)`
## Warning in left_join(., dplyr::rename(haplotype.labels, snp_2 = POS, hap_2 = Haplotype)): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6412 of `x` matches multiple rows in `y`.
## ℹ Row 199 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
Let’s check if there are cluster relationships that we can confirm or deny by read support.
cluster.test.df = data.frame()
for (tree.name in unique(valid.trees.df$tree)) {
# Get the edges from this tree
tree.edges = valid.trees.df %>%
filter(tree == tree.name) %>%
select(from, to)
# Check the evidence for nesting between clusters that are linked on the tree
for (i in 1:nrow(tree.edges)) {
# Get the edge
edge = tree.edges[i,]
# Get the cluster names
from = str_replace_all(edge$from, "_", " ")
to = str_replace_all(edge$to, "_", " ")
# Check if it's a cluster is predicted to descend from another cluster
if(str_detect(from, "cluster") & str_detect(to, "cluster")) {
# Status statement
print(paste("Checking if", to, "is descended from", from, "for", tree.name))
# Filter the bridging reads for overlaps between the clusters being tested
snp.pairs = labled.bridging.df %>%
filter((hap_1 == to & hap_2 == from) | (hap_1 == from & hap_2 == to)) %>%
select(snp_1, snp_2) %>%
distinct()
# Check if there are any bridging to test
if (nrow(snp.pairs) == 0) {
# There aren't any bridging reads overlapping these SNPs
linkage.information = "no information"
print(linkage.information)
# Write to a row
row = tibble(snp_1 = NA, snp_2 = NA,
hap_1 = NA, hap_2 = NA,
from = from, to = to,
lik_related = NA, lik_unrelated = NA,
tree = tree.name, relationship = linkage.information)
cluster.test.df = rbind(cluster.test.df, row)
# Go to the next haplotypes
next
}
for (i in 1:nrow(snp.pairs)) {
# Get the positions of the SNPs
snp.1 = snp.pairs[i,]$snp_1
snp.2 = snp.pairs[i,]$snp_2
# Print the SNP overlaps being checked
print(paste("Checking SNPs:", snp.1, "v.", snp.2))
# Get the cluster names
snp.1.cluster = haplotype.labels %>%
filter(POS == snp.1) %>%
pull()
snp.2.cluster = haplotype.labels %>%
filter(POS == snp.2) %>%
pull()
# Get the overlaps
overlaps = labled.bridging.df %>%
filter(snp_1 == snp.1, snp_2 == snp.2)
# Iterate over each tissue in the overlaps
tissue.test = foreach(i = 1:nrow(overlaps), .combine = "rbind") %do% {
# Get the tissue
tissue = overlaps[i, ]$Tissue
# Get the cluster frequencies
f.anc = max(0, (haplotype.mean %>% filter(Haplotype == from, Tissue == tissue))$AF)
f.dec = max(0, (haplotype.mean %>% filter(Haplotype == to, Tissue == tissue))$AF)
# Overlaps to test in this tissue
toTest = overlaps %>%
filter(Tissue == tissue)
# x11 refers to the number of reads overlapping both
x11 = toTest$`11`
if (snp.1.cluster == from) {
# x10 is the number of reads overlapping just descendant
x10 = toTest$`10`
# x01 is the number of reads overlapping just ancestor
x01 = toTest$`01`
} else {
# x10 is the number of reads overlapping just descendant (01 in data)
x10 = toTest$`01`
# x01 is the number of reads overlapping just ancestor (10 in data)
x01 = toTest$`10`
}
# x00 is the number of reads overlapping neither
x00 = toTest$`00`
# There are two possibilities
# 1: The inferred descendant does arise on the ancestor
# 2: The inferred descendant and ancestor arose independently
error = 0.01
# Descendant and ancestor should be all of the descendant reads
f11 = (f.dec + error)/(1 + 4*error)
# Only the descendant should be just the error
f10 = (0 + error)/(1 + 4*error)
# Only the ancestor should be the frequency of the ancestor minus the descendant
f01 = max(0 + error, (f.anc - f.dec + error))/(1 + 4*error)
# The remainder should be neither
f00 = 1 - f11 - f10 - f01
# What is the probability of observing these counts given the haplotypes are nested?
lik.nested = -dmultinom(c(x11, x10, x01, x00), prob = c(f11, f10, f01, f00), log = T)
# We should not see any reads with both
f11 = (0 + error)/(1 + 4*error)
# Just the ancestor
f01 = (f.anc + error)/(1 + 4*error)
# Just the descendant
f10 = (f.dec + error)/(1 + 4*error)
# The remainder should be the subtraction
f00 = 1 - f11 - f01 - f10
# What is the probability of observing these counts given the haplotypes are not nested?
lik.independant = -dmultinom(c(x11, x10, x01, x00), prob = c(f11, f10, f01, f00), log = T)
# Results
results = c(lik.nested, lik.independant)
}
AICs = 2 * apply(tissue.test, 2, sum, na.rm = TRUE)
AIC.min = min(AICs)
rel.related = exp((AIC.min - AICs[1])/2)
rel.unrelated = exp((AIC.min - AICs[2])/2)
lik.related = rel.related/(rel.related + rel.unrelated)
lik.unrelated = rel.unrelated/(rel.related + rel.unrelated)
linkage.information = c("Nesting", "Non-nesting", "neither")[c(lik.related > 0.95, lik.unrelated > 0.95,
lik.related < 0.95 & lik.unrelated < 0.95)]
print(linkage.information)
# Write to a row
row = tibble(snp_1 = snp.1, snp_2 = snp.2,
hap_1 = snp.1.cluster, hap_2 = snp.2.cluster,
from = from, to = to,
lik_related = lik.related, lik_unrelated = lik.unrelated,
tree = tree.name, relationship = linkage.information)
cluster.test.df = rbind(cluster.test.df, row)
}
}
}
}
## [1] "Checking if cluster 5 is descended from cluster 1 for tree 3"
## [1] "no information"
## [1] "Checking if cluster 5 is descended from cluster 1 for tree 17"
## [1] "no information"
## [1] "Checking if cluster 8 is descended from cluster 2 for tree 17"
## [1] "Checking SNPs: 2343 v. 2543"
## [1] "Non-nesting"
## [1] "Checking SNPs: 2349 v. 2543"
## [1] "Non-nesting"
## [1] "Checking SNPs: 5368 v. 5405"
## [1] "Non-nesting"
It looks like we can confidently say that cluster 8 is
not descended from cluster 2. This means
that can rule out all but a single single valid tree.
There is only 1 tree with violations and that’s tree 17.
Finally, now that we know how the haplotypes are related to one another, we can re-name them to something that is a little more clear.
final.tree.df = final.tree.df %>%
mutate(to = case_when(
to == "cluster_1" ~ "cluster 1",
to == "cluster_2" ~ "cluster 2",
to == "cluster_3" ~ "cluster 3",
to == "cluster_4" ~ "cluster 4",
to == "cluster_5" ~ "cluster 1a",
to == "cluster_6" ~ "G-FC2",
to == "cluster_7" ~ "cluster 5",
to == "cluster_8" ~ "cluster 6",
to == "cluster_9" ~ "cluster 7",
to == "cluster_10" ~ "cluster 8",
to == "cluster_11" ~ "cluster 9",
to == "cluster_12" ~ "cluster 10",
to == "cluster_13" ~ "cluster 11",
to == "genome_01" ~ "G-01",
to == "genome_1" ~ "G-1",
to == "genome_2" ~ "G-2",
TRUE ~ to)) %>%
mutate(from = case_when(
from == "cluster_1" ~ "cluster 1",
from == "cluster_2" ~ "cluster 2",
from == "cluster_3" ~ "cluster 3",
from == "cluster_4" ~ "cluster 4",
from == "cluster_5" ~ "cluster 1a",
from == "cluster_6" ~ "G-FC2",
from == "cluster_7" ~ "cluster 5",
from == "cluster_8" ~ "cluster 6",
from == "cluster_9" ~ "cluster 7",
from == "cluster_10" ~ "cluster 8",
from == "cluster_11" ~ "cluster 9",
from == "cluster_12" ~ "cluster 10",
from == "cluster_13" ~ "cluster 11",
from == "genome_01" ~ "G-01",
from == "genome_1" ~ "G-1",
from == "genome_2" ~ "G-2",
TRUE ~ from))
haplotype.mean = haplotype.mean %>%
mutate(Haplotype = case_when(
Haplotype == "cluster 1" ~ "cluster 1",
Haplotype == "cluster 2" ~ "cluster 2",
Haplotype == "cluster 3" ~ "cluster 3",
Haplotype == "cluster 4" ~ "cluster 4",
Haplotype == "cluster 5" ~ "cluster 1a",
Haplotype == "cluster 6" ~ "G-FC2",
Haplotype == "cluster 7" ~ "cluster 5",
Haplotype == "cluster 8" ~ "cluster 6",
Haplotype == "cluster 9" ~ "cluster 7",
Haplotype == "cluster 10" ~ "cluster 8",
Haplotype == "cluster 11" ~ "cluster 9",
Haplotype == "cluster 12" ~ "cluster 10",
Haplotype == "cluster 13" ~ "cluster 11",
Haplotype == "genome 01" ~ "G-01",
Haplotype == "genome 1" ~ "G-1",
Haplotype == "genome 2" ~ "G-2",
TRUE ~ Haplotype))
haplotype.colors = c("#636361", "#989891", "#000000", "#22488F", "#B02B23", "#E1EAF6", "#A6C9DF", "#7AADD2","#5691C1", "#3871B0", "#67649F", "#8484B1", "#FFFCBA", "#F9D984", "#F4B460", "#EE9B4F", "#DE4B32")
names(haplotype.colors) = c("Anc", "G-01", "G-FC2", "G-1", "G-2", "cluster 1", "cluster 1a", "cluster 2", "cluster 3", "cluster 4", "cluster 5", "cluster 6", "cluster 7", "cluster 8", "cluster 9", "cluster 10", "cluster 11")
## [1] "Writing final tree to results/phylogeny/filtered_spruce_tree.csv"
Now we can confirm that cluster 5 is on the background of cluster 1. Let’s visualize this in the format of a ‘psuedo-mueller’ plot.